rm(list = ls())
library(tidyverse)
library(tidymodels)
library(readr)
library(tswge)
library(ggplot2)
library(yardstick)
acfdf <- function(vec) {
vacf <- acf(vec, plot = F)
with(vacf, data.frame(lag, acf))
}
ggacf <- function(vec) {
ac <- acfdf(vec)
ggplot(data = ac, aes(x = lag, y = acf)) + geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0))
}
tplot <- function(vec) {
df <- data.frame(X = vec, t = seq_along(vec))
ggplot(data = df, aes(x = t, y = X)) + geom_line()
}Biostat 212a Homework 6
Due Mar 22, 2024 @ 11:59PM
Load R libraries.
1 New York Stock Exchange (NYSE) data (1962-1986) (140 pts)
The NYSE.csv file contains three daily time series from the New York Stock Exchange (NYSE) for the period Dec 3, 1962-Dec 31, 1986 (6,051 trading days).
Log trading volume(\(v_t\)): This is the fraction of all outstanding shares that are traded on that day, relative to a 100-day moving average of past turnover, on the log scale.Dow Jones return(\(r_t\)): This is the difference between the log of the Dow Jones Industrial Index on consecutive trading days.Log volatility(\(z_t\)): This is based on the absolute values of daily price movements.
# Read in NYSE data from url
url = "https://raw.githubusercontent.com/ucla-biostat-212a/2024winter/master/slides/data/NYSE.csv"
NYSE <- read_csv(url)
NYSE# A tibble: 6,051 × 6
date day_of_week DJ_return log_volume log_volatility train
<date> <chr> <dbl> <dbl> <dbl> <lgl>
1 1962-12-03 mon -0.00446 0.0326 -13.1 TRUE
2 1962-12-04 tues 0.00781 0.346 -11.7 TRUE
3 1962-12-05 wed 0.00384 0.525 -11.7 TRUE
4 1962-12-06 thur -0.00346 0.210 -11.6 TRUE
5 1962-12-07 fri 0.000568 0.0442 -11.7 TRUE
6 1962-12-10 mon -0.0108 0.133 -10.9 TRUE
7 1962-12-11 tues 0.000124 -0.0115 -11.0 TRUE
8 1962-12-12 wed 0.00336 0.00161 -11.0 TRUE
9 1962-12-13 thur -0.00330 -0.106 -11.0 TRUE
10 1962-12-14 fri 0.00447 -0.138 -11.0 TRUE
# ℹ 6,041 more rows
The autocorrelation at lag \(\ell\) is the correlation of all pairs \((v_t, v_{t-\ell})\) that are \(\ell\) trading days apart. These sizable correlations give us confidence that past values will be helpful in predicting the future.
Code
ggacf(NYSE$log_volume) + ggthemes::theme_few()Do a similar plot for (1) the correlation between \(v_t\) and lag \(\ell\) Dow Jones return \(r_{t-\ell}\) and (2) correlation between \(v_t\) and lag \(\ell\) Log volatility \(z_{t-\ell}\).
Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$DJ_return, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `DJ return`")Code
seq(1, 30) %>%
map(function(x) {cor(NYSE$log_volume , lag(NYSE$log_volatility, x), use = "pairwise.complete.obs")}) %>%
unlist() %>%
tibble(lag = 1:30, cor = .) %>%
ggplot(aes(x = lag, y = cor)) +
geom_hline(aes(yintercept = 0)) +
geom_segment(mapping = aes(xend = lag, yend = 0)) +
ggtitle("AutoCorrelation between `log volume` and lagged `log volatility`")1.1 Project goal
Our goal is to forecast daily Log trading volume, using various machine learning algorithms we learnt in this class.
The data set is already split into train (before Jan 1st, 1980, \(n_{\text{train}} = 4,281\)) and test (after Jan 1st, 1980, \(n_{\text{test}} = 1,770\)) sets.
In general, we will tune the lag \(L\) to acheive best forecasting performance. In this project, we would fix \(L=5\). That is we always use the previous five trading days’ data to forecast today’s log trading volume.
Pay attention to the nuance of splitting time series data for cross validation. Study and use the time-series functionality in tidymodels. Make sure to use the same splits when tuning different machine learning algorithms.
Use the \(R^2\) between forecast and actual values as the cross validation and test evaluation criterion.
1.2 Baseline method (20 pts)
We use the straw man (use yesterday’s value of log trading volume to predict that of today) as the baseline method. Evaluate the \(R^2\) of this method on the test data.
Before we tune different machine learning methods, let’s first separate into the test and non-test sets. We drop the first 5 trading days which lack some lagged variables.
# Lag: look back L trading days
# Do not need to include, as we included them in receipe
L = 5
for(i in seq(1, L)) {
NYSE <- NYSE %>%
mutate(!!paste("DJ_return_lag", i, sep = "") := lag(NYSE$DJ_return, i),
!!paste("log_volume_lag", i, sep = "") := lag(NYSE$log_volume, i),
!!paste("log_volatility_lag", i, sep = "") := lag(NYSE$log_volatility, i))
}
NYSE <- NYSE %>% na.omit()# Drop beginning trading days which lack some lagged variables
NYSE_other <- NYSE %>%
filter(train == 'TRUE') %>%
select(-train) %>%
drop_na()
dim(NYSE_other)[1] 4276 20
NYSE_other# A tibble: 4,276 × 20
date day_of_week DJ_return log_volume log_volatility DJ_return_lag1
<date> <chr> <dbl> <dbl> <dbl> <dbl>
1 1962-12-10 mon -0.0108 0.133 -10.9 0.000568
2 1962-12-11 tues 0.000124 -0.0115 -11.0 -0.0108
3 1962-12-12 wed 0.00336 0.00161 -11.0 0.000124
4 1962-12-13 thur -0.00330 -0.106 -11.0 0.00336
5 1962-12-14 fri 0.00447 -0.138 -11.0 -0.00330
6 1962-12-17 mon -0.00402 -0.0496 -11.0 0.00447
7 1962-12-18 tues -0.00832 -0.0434 -10.7 -0.00402
8 1962-12-19 wed 0.0107 0.0536 -10.4 -0.00832
9 1962-12-20 thur 0.00239 0.105 -10.5 0.0107
10 1962-12-21 fri -0.00330 -0.0890 -10.5 0.00239
# ℹ 4,266 more rows
# ℹ 14 more variables: log_volume_lag1 <dbl>, log_volatility_lag1 <dbl>,
# DJ_return_lag2 <dbl>, log_volume_lag2 <dbl>, log_volatility_lag2 <dbl>,
# DJ_return_lag3 <dbl>, log_volume_lag3 <dbl>, log_volatility_lag3 <dbl>,
# DJ_return_lag4 <dbl>, log_volume_lag4 <dbl>, log_volatility_lag4 <dbl>,
# DJ_return_lag5 <dbl>, log_volume_lag5 <dbl>, log_volatility_lag5 <dbl>
NYSE_test = NYSE %>%
filter(train == 'FALSE') %>%
select(-train) %>%
drop_na()
dim(NYSE_test)[1] 1770 20
library(yardstick)
# cor(NYSE_test$log_volume, NYSE_test$log_volume_lag1) %>% round(2)
r2_test_strawman = rsq_vec(NYSE_test$log_volume, lag(NYSE_test$log_volume, 1)) %>% round(2)
print(paste("Straw man test R2: ", r2_test_strawman))[1] "Straw man test R2: 0.35"
1.3 Autoregression (AR) forecaster (30 pts)
Let \[ y = \begin{pmatrix} v_{L+1} \\ v_{L+2} \\ v_{L+3} \\ \vdots \\ v_T \end{pmatrix}, \quad M = \begin{pmatrix} 1 & v_L & v_{L-1} & \cdots & v_1 \\ 1 & v_{L+1} & v_{L} & \cdots & v_2 \\ \vdots & \vdots & \vdots & \ddots & \vdots \\ 1 & v_{T-1} & v_{T-2} & \cdots & v_{T-L} \end{pmatrix}. \]
Fit an ordinary least squares (OLS) regression of \(y\) on \(M\), giving \[ \hat v_t = \hat \beta_0 + \hat \beta_1 v_{t-1} + \hat \beta_2 v_{t-2} + \cdots + \hat \beta_L v_{t-L}, \] known as an order-\(L\) autoregression model or AR(\(L\)).
Before we start the model training, let’s talk about time series resampling. We will use the
rolling_originfunction in thersamplepackage to create a time series cross-validation plan.When the data have a strong time component, a resampling method should support modeling to estimate seasonal and other temporal trends within the data. A technique that randomly samples values from the training set can disrupt the model’s ability to estimate these patterns.
NYSE %>%
ggplot(aes(x = date, y = log_volume)) +
geom_line() +
geom_smooth(method = "lm")correct_split <- initial_time_split(NYSE_other %>% arrange(date))
bind_rows(
training(correct_split) %>% mutate(type = "train"),
testing(correct_split) %>% mutate(type = "test")
) %>%
ggplot(aes(x = date, y = log_volume, color = type, group = NA)) +
geom_line()rolling_origin(NYSE_other %>% arrange(date), initial = 30, assess = 7) %>%
#sliding_period(NYSE_other %>% arrange(date), date, period = "day", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice0001", "Slice0002", "Slice0003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")sliding_period(NYSE_other %>% arrange(date),
date, period = "month", lookback = Inf, assess_stop = 1) %>%
mutate(train_data = map(splits, analysis),
test_data = map(splits, assessment)) %>%
select(-splits) %>%
pivot_longer(-id) %>%
filter(id %in% c("Slice001", "Slice002", "Slice003")) %>%
unnest(value) %>%
ggplot(aes(x = date, y = log_volume, color = name, group = NA)) +
geom_point() +
geom_line() +
facet_wrap(~id, scales = "fixed")Rolling forecast origin resampling (Hyndman and Athanasopoulos 2018) provides a method that emulates how time series data is often partitioned in practice, estimating the model with historical data and evaluating it with the most recent data.
Tune AR(5) with elastic net (lasso + ridge) regularization using all 3 features on the training data, and evaluate the test performance.
1.4 Preprocessing
en_receipe <-
recipe(log_volume ~ log_volume_lag1 + DJ_return_lag1 + log_volatility_lag1 + log_volume_lag2 + DJ_return_lag2 + log_volatility_lag2 + log_volume_lag3 + DJ_return_lag3 + log_volatility_lag3 + log_volume_lag4 + DJ_return_lag4 + log_volatility_lag4 + log_volume_lag5 + DJ_return_lag5 + log_volatility_lag5 + date, data = NYSE_other) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_naomit(all_predictors()) %>%
prep(data = NYSE_other)
en_receipe1.5 Model training
###AR(5) with elastic net (lasso + ridge) regularization
### Model
enet_mod <-
# mixture = 0 (ridge), mixture = 1 (lasso)
# mixture = (0, 1) elastic net
# As an example, we set mixture = 0.5. It needs to be tuned.
linear_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
enet_modLinear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = 0.5
Computational engine: glmnet
en_wf <-
workflow() %>%
add_model(enet_mod) %>%
add_recipe(en_receipe %>% step_rm(date) %>% step_indicate_na())
en_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = tune()
mixture = 0.5
Computational engine: glmnet
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
folds# Sliding period resampling
# A tibble: 204 × 2
splits id
<list> <chr>
1 <split [15/22]> Slice001
2 <split [37/19]> Slice002
3 <split [56/21]> Slice003
4 <split [77/21]> Slice004
5 <split [98/22]> Slice005
6 <split [120/20]> Slice006
7 <split [140/22]> Slice007
8 <split [162/22]> Slice008
9 <split [184/20]> Slice009
10 <split [204/23]> Slice010
# ℹ 194 more rows
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)
month_folds# Sliding period resampling
# A tibble: 200 × 2
splits id
<list> <chr>
1 <split [98/22]> Slice001
2 <split [120/20]> Slice002
3 <split [140/22]> Slice003
4 <split [162/22]> Slice004
5 <split [184/20]> Slice005
6 <split [204/23]> Slice006
7 <split [227/18]> Slice007
8 <split [245/21]> Slice008
9 <split [266/22]> Slice009
10 <split [288/19]> Slice010
# ℹ 190 more rows
lambda_grid <-
grid_regular(penalty(range = c(-8, -7), trans = log10_trans()), levels = 3)
lambda_grid# A tibble: 3 × 1
penalty
<dbl>
1 0.00000001
2 0.0000000316
3 0.0000001
en_fit <- tune_grid(en_wf, resamples = month_folds, grid = lambda_grid)
en_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [6 × 5]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [6 × 5]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [6 × 5]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [6 × 5]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [6 × 5]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [6 × 5]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [6 × 5]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [6 × 5]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [6 × 5]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [6 × 5]> <tibble [0 × 3]>
# ℹ 190 more rows
en_fit %>%
collect_metrics() %>%
print(width = Inf) %>%
filter(.metric == "rsq") %>%
ggplot(mapping = aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
labs(x = "Penalty", y = "CV RMSE") +
scale_x_log10(labels = scales::label_number())# A tibble: 6 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 rmse standard 0.148 200 0.00292 Preprocessor1_Model1
2 0.00000001 rsq standard 0.250 200 0.0127 Preprocessor1_Model1
3 0.0000000316 rmse standard 0.148 200 0.00292 Preprocessor1_Model2
4 0.0000000316 rsq standard 0.250 200 0.0127 Preprocessor1_Model2
5 0.0000001 rmse standard 0.148 200 0.00292 Preprocessor1_Model3
6 0.0000001 rsq standard 0.250 200 0.0127 Preprocessor1_Model3
en_fit %>%
show_best("rsq")# A tibble: 3 × 7
penalty .metric .estimator mean n std_err .config
<dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 0.00000001 rsq standard 0.250 200 0.0127 Preprocessor1_Model1
2 0.0000000316 rsq standard 0.250 200 0.0127 Preprocessor1_Model2
3 0.0000001 rsq standard 0.250 200 0.0127 Preprocessor1_Model3
best_en_fit <- en_fit %>%
select_best("rsq")
best_en_fit# A tibble: 1 × 2
penalty .config
<dbl> <chr>
1 0.00000001 Preprocessor1_Model1
# Final workflow
final_wf_en <- en_wf %>%
finalize_workflow(best_en_fit)
final_wf_en══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: linear_reg()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Linear Regression Model Specification (regression)
Main Arguments:
penalty = 1e-08
mixture = 0.5
Computational engine: glmnet
# Fit the whole training set, then predict the test cases
final_fit_en <-
final_wf_en %>%
last_fit(correct_split)
final_fit_en# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit_en %>% collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.156 Preprocessor1_Model1
2 rsq standard 0.599 Preprocessor1_Model1
predictions <- final_fit_en %>%
collect_predictions()
predictions# A tibble: 1,069 × 5
id .pred .row log_volume .config
<chr> <dbl> <int> <dbl> <chr>
1 train/test split -0.196 3208 0.0353 Preprocessor1_Model1
2 train/test split -0.0383 3209 0.0338 Preprocessor1_Model1
3 train/test split -0.0636 3210 -0.141 Preprocessor1_Model1
4 train/test split -0.117 3211 -0.352 Preprocessor1_Model1
5 train/test split -0.146 3212 0.154 Preprocessor1_Model1
6 train/test split 0.0169 3213 -0.167 Preprocessor1_Model1
7 train/test split -0.120 3214 0.102 Preprocessor1_Model1
8 train/test split -0.0228 3215 -0.0838 Preprocessor1_Model1
9 train/test split -0.0910 3216 -0.247 Preprocessor1_Model1
10 train/test split -0.0873 3217 0.205 Preprocessor1_Model1
# ℹ 1,059 more rows
- Hint: Workflow: Lasso is a good starting point.
1.6 Random forest forecaster (30pts)
Use the same features as in AR(\(L\)) for the random forest. Tune the random forest and evaluate the test performance.
Hint: Workflow: Random Forest for Prediction is a good starting point.
rf_recipe <- recipe(log_volume ~ log_volume_lag1 + DJ_return_lag1 + log_volatility_lag1 + log_volume_lag2 + DJ_return_lag2 + log_volatility_lag2 + log_volume_lag3 + DJ_return_lag3 + log_volatility_lag3 + log_volume_lag4 + DJ_return_lag4 + log_volatility_lag4 + log_volume_lag5 + DJ_return_lag5 + log_volatility_lag5 + date, data = NYSE_other) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_naomit(all_predictors()) %>%
prep(data = NYSE_other)
rf_reciperf_mod <-
rand_forest(
mode = "regression",
# Number of predictors randomly sampled in each split
mtry = tune(),
# Number of trees in ensemble
trees = tune()
) %>%
set_engine("ranger")
rf_modRandom Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
rf_wf <- workflow() %>%
add_recipe(rf_recipe %>% step_rm(date) %>% step_indicate_na()) %>%
add_model(rf_mod)
rf_wf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = tune()
trees = tune()
Computational engine: ranger
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
folds# Sliding period resampling
# A tibble: 204 × 2
splits id
<list> <chr>
1 <split [15/22]> Slice001
2 <split [37/19]> Slice002
3 <split [56/21]> Slice003
4 <split [77/21]> Slice004
5 <split [98/22]> Slice005
6 <split [120/20]> Slice006
7 <split [140/22]> Slice007
8 <split [162/22]> Slice008
9 <split [184/20]> Slice009
10 <split [204/23]> Slice010
# ℹ 194 more rows
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)
month_folds# Sliding period resampling
# A tibble: 200 × 2
splits id
<list> <chr>
1 <split [98/22]> Slice001
2 <split [120/20]> Slice002
3 <split [140/22]> Slice003
4 <split [162/22]> Slice004
5 <split [184/20]> Slice005
6 <split [204/23]> Slice006
7 <split [227/18]> Slice007
8 <split [245/21]> Slice008
9 <split [266/22]> Slice009
10 <split [288/19]> Slice010
# ℹ 190 more rows
rf_grid <- grid_regular(
trees(range = c(100L, 300L)),
mtry(range = c(1L, 5L)),
levels = c(3, 5)
)
rf_grid# A tibble: 15 × 2
trees mtry
<int> <int>
1 100 1
2 200 1
3 300 1
4 100 2
5 200 2
6 300 2
7 100 3
8 200 3
9 300 3
10 100 4
11 200 4
12 300 4
13 100 5
14 200 5
15 300 5
rf_fit <- tune_grid(
rf_wf,
resamples = month_folds,
grid = rf_grid,
metrics = metric_set(rmse, rsq)
)
rf_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [30 × 6]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [30 × 6]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [30 × 6]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [30 × 6]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [30 × 6]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [30 × 6]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [30 × 6]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [30 × 6]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [30 × 6]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [30 × 6]> <tibble [0 × 3]>
# ℹ 190 more rows
rf_fit %>%
collect_metrics() %>%
print(width = Inf) %>%
filter(.metric == "rsq") %>%
mutate(mtry = as.factor(mtry)) %>%
ggplot(mapping = aes(x = trees, y = mean, color = mtry)) +
# geom_point() +
geom_line() +
labs(x = "Num. of Trees", y = "CV mse")# A tibble: 30 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 1 100 rmse standard 0.160 200 0.00358 Preprocessor1_Model01
2 1 100 rsq standard 0.214 200 0.0126 Preprocessor1_Model01
3 1 200 rmse standard 0.159 200 0.00354 Preprocessor1_Model02
4 1 200 rsq standard 0.225 200 0.0128 Preprocessor1_Model02
5 1 300 rmse standard 0.159 200 0.00355 Preprocessor1_Model03
6 1 300 rsq standard 0.222 200 0.0127 Preprocessor1_Model03
7 2 100 rmse standard 0.154 200 0.00327 Preprocessor1_Model04
8 2 100 rsq standard 0.235 200 0.0129 Preprocessor1_Model04
9 2 200 rmse standard 0.153 200 0.00324 Preprocessor1_Model05
10 2 200 rsq standard 0.241 200 0.0129 Preprocessor1_Model05
# ℹ 20 more rows
# Final workflow
rf_fit %>%
show_best("rsq")# A tibble: 5 × 8
mtry trees .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 5 200 rsq standard 0.250 200 0.0128 Preprocessor1_Model14
2 5 300 rsq standard 0.248 200 0.0128 Preprocessor1_Model15
3 5 100 rsq standard 0.248 200 0.0128 Preprocessor1_Model13
4 4 300 rsq standard 0.248 200 0.0128 Preprocessor1_Model12
5 4 200 rsq standard 0.248 200 0.0128 Preprocessor1_Model11
best_rf <- rf_fit %>%
select_best("rsq")
best_rf# A tibble: 1 × 3
mtry trees .config
<int> <int> <chr>
1 5 200 Preprocessor1_Model14
# Final workflow
final_wf_rf <- rf_wf %>%
finalize_workflow(best_rf)
final_wf_rf══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: rand_forest()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Random Forest Model Specification (regression)
Main Arguments:
mtry = 5
trees = 200
Computational engine: ranger
# Fit the whole training set, then predict the test cases
final_fit_rf <-
final_wf_rf %>%
last_fit(correct_split)
final_fit_rf# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit_rf %>%
collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.162 Preprocessor1_Model1
2 rsq standard 0.579 Preprocessor1_Model1
1.7 Boosting forecaster (30pts)
Use the same features as in AR(\(L\)) for the boosting. Tune the boosting algorithm and evaluate the test performance.
Hint: Workflow: Boosting tree for Prediction is a good starting point.
boost_recipe <- recipe(log_volume ~ log_volume_lag1 + DJ_return_lag1 + log_volatility_lag1 + log_volume_lag2 + DJ_return_lag2 + log_volatility_lag2 + log_volume_lag3 + DJ_return_lag3 + log_volatility_lag3 + log_volume_lag4 + DJ_return_lag4 + log_volatility_lag4 + log_volume_lag5 + DJ_return_lag5 + log_volatility_lag5 + date, data = NYSE_other) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
step_normalize(all_numeric_predictors(), -all_outcomes()) %>%
step_naomit(all_predictors()) %>%
prep(data = NYSE_other)
boost_recipegb_mod <-
boost_tree(
mode = "regression",
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("xgboost")
gb_modBoosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
boost_workflow <- workflow() %>%
add_model(gb_mod) %>%
add_recipe(boost_recipe %>% step_rm(date) %>% step_indicate_na())
boost_workflow══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = tune()
learn_rate = tune()
Computational engine: xgboost
folds <- NYSE_other %>% arrange(date) %>%
sliding_period(date, period = "month", lookback = Inf, assess_stop = 1)
# rolling_origin(initial = 5, assess = 1)
month_folds <- NYSE_other %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4)boost_grid <- grid_regular(
tree_depth(range = c(1L, 2L)),
learn_rate(range = c(-3, -2), trans = log10_trans()),
levels = c(2, 3)
)
boost_grid# A tibble: 6 × 2
tree_depth learn_rate
<int> <dbl>
1 1 0.001
2 2 0.001
3 1 0.00316
4 2 0.00316
5 1 0.01
6 2 0.01
boost_fit <- tune_grid(
boost_workflow,
resamples = month_folds,
grid = boost_grid,
metrics = metric_set(rmse, rsq)
)
boost_fit# Tuning results
# Sliding period resampling
# A tibble: 200 × 4
splits id .metrics .notes
<list> <chr> <list> <list>
1 <split [98/22]> Slice001 <tibble [12 × 6]> <tibble [0 × 3]>
2 <split [120/20]> Slice002 <tibble [12 × 6]> <tibble [0 × 3]>
3 <split [140/22]> Slice003 <tibble [12 × 6]> <tibble [0 × 3]>
4 <split [162/22]> Slice004 <tibble [12 × 6]> <tibble [0 × 3]>
5 <split [184/20]> Slice005 <tibble [12 × 6]> <tibble [0 × 3]>
6 <split [204/23]> Slice006 <tibble [12 × 6]> <tibble [0 × 3]>
7 <split [227/18]> Slice007 <tibble [12 × 6]> <tibble [0 × 3]>
8 <split [245/21]> Slice008 <tibble [12 × 6]> <tibble [0 × 3]>
9 <split [266/22]> Slice009 <tibble [12 × 6]> <tibble [0 × 3]>
10 <split [288/19]> Slice010 <tibble [12 × 6]> <tibble [0 × 3]>
# ℹ 190 more rows
boost_fit %>%
collect_metrics() %>%
print(width = Inf) %>%
filter(.metric == "rsq") %>%
ggplot(mapping = aes(x = learn_rate, y = mean, color = factor(tree_depth))) +
geom_point() +
geom_line() +
labs(x = "Learning Rate", y = "CV AUC") +
scale_x_log10()# A tibble: 12 × 8
tree_depth learn_rate .metric .estimator mean n std_err
<int> <dbl> <chr> <chr> <dbl> <int> <dbl>
1 1 0.001 rmse standard 0.257 200 0.00562
2 1 0.001 rsq standard 0.203 200 0.0120
3 2 0.001 rmse standard 0.251 200 0.00492
4 2 0.001 rsq standard 0.221 200 0.0122
5 1 0.00316 rmse standard 0.162 200 0.00348
6 1 0.00316 rsq standard 0.221 200 0.0121
7 2 0.00316 rmse standard 0.155 200 0.00311
8 2 0.00316 rsq standard 0.237 200 0.0127
9 1 0.01 rmse standard 0.152 200 0.00308
10 1 0.01 rsq standard 0.243 200 0.0125
11 2 0.01 rmse standard 0.150 200 0.00297
12 2 0.01 rsq standard 0.256 200 0.0130
.config
<chr>
1 Preprocessor1_Model1
2 Preprocessor1_Model1
3 Preprocessor1_Model2
4 Preprocessor1_Model2
5 Preprocessor1_Model3
6 Preprocessor1_Model3
7 Preprocessor1_Model4
8 Preprocessor1_Model4
9 Preprocessor1_Model5
10 Preprocessor1_Model5
11 Preprocessor1_Model6
12 Preprocessor1_Model6
boost_fit %>%
show_best("rsq")# A tibble: 5 × 8
tree_depth learn_rate .metric .estimator mean n std_err .config
<int> <dbl> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2 0.01 rsq standard 0.256 200 0.0130 Preprocessor1_Mo…
2 1 0.01 rsq standard 0.243 200 0.0125 Preprocessor1_Mo…
3 2 0.00316 rsq standard 0.237 200 0.0127 Preprocessor1_Mo…
4 2 0.001 rsq standard 0.221 200 0.0122 Preprocessor1_Mo…
5 1 0.00316 rsq standard 0.221 200 0.0121 Preprocessor1_Mo…
best_gb <- boost_fit %>%
select_best("rsq")
best_gb# A tibble: 1 × 3
tree_depth learn_rate .config
<int> <dbl> <chr>
1 2 0.01 Preprocessor1_Model6
# Final workflow
final_wf_boost <- boost_workflow %>%
finalize_workflow(best_gb)
final_wf_boost══ Workflow ════════════════════════════════════════════════════════════════════
Preprocessor: Recipe
Model: boost_tree()
── Preprocessor ────────────────────────────────────────────────────────────────
5 Recipe Steps
• step_dummy()
• step_normalize()
• step_naomit()
• step_rm()
• step_indicate_na()
── Model ───────────────────────────────────────────────────────────────────────
Boosted Tree Model Specification (regression)
Main Arguments:
trees = 1000
tree_depth = 2
learn_rate = 0.01
Computational engine: xgboost
# Fit the whole training set, then predict the test cases
final_fit_boost <-
final_wf_boost %>%
last_fit(correct_split)
final_fit_boost# Resampling results
# Manual resampling
# A tibble: 1 × 6
splits id .metrics .notes .predictions .workflow
<list> <chr> <list> <list> <list> <list>
1 <split [3207/1069]> train/test split <tibble> <tibble> <tibble> <workflow>
# Test metrics
final_fit_boost %>%
collect_metrics()# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 rmse standard 0.159 Preprocessor1_Model1
2 rsq standard 0.595 Preprocessor1_Model1
1.8 Summary (30pts)
Your score for this question is largely determined by your final test performance.
Summarize the performance of different machine learning forecasters in the following format.
The summary of the performance of various machine learning forecasters on predicting the daily Log trading volume using the New York Stock Exchange data from 1962 to 1986, with training data up to January 2, 1980, is presented below. The evaluation of model performance was based on the coefficient of determination (R^2), both on cross-validation (CV) and on the unseen test data. The R^2 metric measures the proportion of variance in the dependent variable that is predictable from the independent variables, with values closer to 1 indicating better model performance. | Method | CV \(R^2\) | Test \(R^2\) | |:——:|:——:|:——:|:——:| | Baseline | - |Straw man test R2: 0.35 | | AR(5) | 0.250 | 0.599 | | Random Forest | 0.253| 0.583| | Boosting | 0.256| 0.595| Here’s a summary of the test performance based on the \(R^2\) values provided: Baseline (Straw Man Test): The baseline method, referred to as “Straw man test,” has an \(R^2\) of 0.35 on the test set. This gives us a basic level of performance against which to compare the other methods. The baseline method doesn’t have a reported CV \(R^2\) value. AR(5) (Autoregressive Model with Lag 5): This model has a CV \(R^2\) of 0.250 and a test \(R^2\) of 0.599. It significantly outperforms the baseline on the test dataset, indicating it can explain a greater proportion of the variance. Random Forest: This method shows a CV \(R^2\) of 0.253 and a test \(R^2\) of 0.583. It’s a competitive model that also outperforms the baseline, though it slightly lags behind the AR(5) model on the test dataset. Boosting: The boosting model has a CV \(R^2\) of 0.256 and a test \(R^2\) of 0.595. It’s very close to the AR(5) model in terms of performance on the test dataset and represents a slight improvement over the Random Forest model. In evaluating these results: Cross-Validation Performance: The cross-validation \(R^2\) values are relatively close for the AR(5), Random Forest, and Boosting methods, suggesting that they have similar predictive capabilities based on the training data. Boosting shows a marginally higher CV \(R^2\), suggesting it might generalize slightly better than the others. Test Performance: On the unseen test data, the AR(5) model has the highest \(R^2\) value (0.599), closely followed by Boosting (0.595), and then Random Forest (0.583). These differences are not substantial but indicate that, in this specific case, the AR(5) and Boosting models are slightly more effective at predicting the dependent variable. Overall, while all three advanced methods (AR(5), Random Forest, and Boosting) demonstrate a significant improvement over the baseline, the AR(5) and Boosting models show the best performance on the test dataset. The choice between them might depend on additional considerations like model complexity, interpretability, and computational resources.
1.9 Extension reading
2 ISL Exercise 12.6.13 (90 pts)
- On the book website, www.statlearning.com, there is a gene expression data set (Ch12Ex13.csv) that consists of 40 tissue samples with measurements on 1,000 genes. The first 20 samples are from healthy patients, while the second 20 are from a diseased group. Load in the data using read.csv(). You will need to select header = F.
- Apply hierarchical clustering to the samples using correlation- based distance, and plot the dendrogram. Do the genes separate the samples into the two groups? Do your results depend on the type of linkage used? PCA and UMAP
- Your collaborator wants to know which genes differ the most across the two groups. Suggest a way to answer this question, and apply it here.
import data
library(tidyverse)
library(cluster)
library(ggplot2)
# Load the data
gene_express <- read.csv('../hw6/Ch12Ex13.csv', header = FALSE, col.names = paste("ID", 1:40, sep = ""))
# Few top rows of the dataset:
head(gene_express) ID1 ID2 ID3 ID4 ID5 ID6
1 -0.96193340 0.4418028 -0.9750051 1.4175040 0.8188148 0.3162937
2 -0.29252570 -1.1392670 0.1958370 -1.2811210 -0.2514393 2.5119970
3 0.25878820 -0.9728448 0.5884858 -0.8002581 -1.8203980 -2.0589240
4 -1.15213200 -2.2131680 -0.8615249 0.6309253 0.9517719 -1.1657240
5 0.19578280 0.5933059 0.2829921 0.2471472 1.9786680 -0.8710180
6 0.03012394 -0.6910143 -0.4034258 -0.7298590 -0.3640986 1.1253490
ID7 ID8 ID9 ID10 ID11 ID12
1 -0.02496682 -0.06396600 0.03149702 -0.3503106 -0.7227299 -0.2819547
2 -0.92220620 0.05954277 -1.40964500 -0.6567122 -0.1157652 0.8259783
3 -0.06476437 1.59212400 -0.17311700 -0.1210874 -0.1875790 -1.5001630
4 -0.39155860 1.06361900 -0.35000900 -1.4890580 -0.2432189 -0.4330340
5 -0.98971500 -1.03225300 -1.10965400 -0.3851423 1.6509570 -1.7449090
6 -1.40404100 -0.80613040 -1.23792400 0.5776018 -0.2720642 2.1765620
ID13 ID14 ID15 ID16 ID17 ID18
1 1.33751500 0.70197980 1.0076160 -0.4653828 0.6385951 0.2867807
2 0.34644960 -0.56954860 -0.1315365 0.6902290 -0.9090382 1.3026420
3 -1.22873700 0.85598900 1.2498550 -0.8980815 0.8702058 -0.2252529
4 -0.03879128 -0.05789677 -1.3977620 -0.1561871 -2.7359820 0.7756169
5 -0.37888530 -0.67982610 -2.1315840 -0.2301718 0.4661243 -1.8004490
6 1.43640700 -1.02578100 0.2981582 -0.5559659 0.2046529 -1.1916480
ID19 ID20 ID21 ID22 ID23 ID24
1 -0.2270782 -0.22004520 -1.2425730 -0.1085056 -1.8642620 -0.5005122
2 -1.6726950 -0.52550400 0.7979700 -0.6897930 0.8995305 0.4285812
3 0.4502892 0.55144040 0.1462943 0.1297400 1.3042290 -1.6619080
4 0.6141562 2.01919400 1.0811390 -1.0766180 -0.2434181 0.5134822
5 0.6262904 -0.09772305 -0.2997108 -0.5295591 -2.0235670 -0.5108402
6 0.2350916 0.67096470 0.1307988 1.0689940 1.2309870 1.1344690
ID25 ID26 ID27 ID28 ID29 ID30
1 -1.32500800 1.06341100 -0.2963712 -0.1216457 0.08516605 0.62417640
2 -0.67611410 -0.53409490 -1.7325070 -1.6034470 -1.08362000 0.03342185
3 -1.63037600 -0.07742528 1.3061820 0.7926002 1.55946500 -0.68851160
4 -0.51285780 2.55167600 -2.3143010 -1.2764700 -1.22927100 1.43439600
5 0.04600274 1.26803000 -0.7439868 0.2231319 0.85846280 0.27472610
6 0.55636800 -0.35876640 1.0798650 -0.2064905 -0.00616453 0.16425470
ID31 ID32 ID33 ID34 ID35 ID36
1 -0.5095915 -0.216725500 -0.05550597 -0.4844491 -0.5215811 1.9491350
2 1.7007080 0.007289556 0.09906234 0.5638533 -0.2572752 -0.5817805
3 -0.6154720 0.009999363 0.94581000 -0.3185212 -0.1178895 0.6213662
4 -0.2842774 0.198945600 -0.09183320 0.3496279 -0.2989097 1.5136960
5 -0.6929984 -0.845707200 -0.17749680 -0.1664908 1.4831550 -1.6879460
6 1.1567370 0.241774500 0.08863952 0.1829540 0.9426771 -0.2096004
ID37 ID38 ID39 ID40
1 1.32433500 0.4681471 1.06110000 1.6559700
2 -0.16988710 -0.5423036 0.31293890 -1.2843770
3 -0.07076396 0.4016818 -0.01622713 -0.5265532
4 0.67118470 0.0108553 -1.04368900 1.6252750
5 -0.14142960 0.2007785 -0.67594210 2.2206110
6 0.53626210 -1.1852260 -0.42274760 0.6243603
grp = factor(rep(c(1, 0), each = 20))
regression <- function(y) {
sum <- summary(lm(y ~ grp))
pv <- sum$coefficients[2, 4]
return(pv)
}
out <- tibble(gene = seq(1, nrow(gene_express)),
p_values = unlist(purrr:: map(1:nrow(gene_express), ~regression(as.matrix(gene_express)[.x, ]))))out %>% arrange(p_values) %>% head(10)# A tibble: 10 × 2
gene p_values
<int> <dbl>
1 502 1.43e-12
2 589 3.19e-12
3 600 9.81e-12
4 590 6.28e-11
5 565 1.74e-10
6 551 1.10e- 9
7 593 1.19e- 9
8 584 1.65e- 9
9 538 2.42e- 9
10 569 2.69e- 9
# sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05/nrow(Ch12Ex13))
sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05 )# install.packages("pheatmap")
library(pheatmap)
# install.packages("ggplotify")
library(ggplotify) ## to convert pheatmap to ggplot2
# install.packages("heatmaply")
library(heatmaply) ## for constructing interactive heatmap#create data frame for annotations
dfh <- data.frame(sample=as.character(colnames(gene_express)), status = "disease") %>%
column_to_rownames("sample")
dfh$status[seq(21, 40)] <- "healthy"
dfh status
ID1 disease
ID2 disease
ID3 disease
ID4 disease
ID5 disease
ID6 disease
ID7 disease
ID8 disease
ID9 disease
ID10 disease
ID11 disease
ID12 disease
ID13 disease
ID14 disease
ID15 disease
ID16 disease
ID17 disease
ID18 disease
ID19 disease
ID20 disease
ID21 healthy
ID22 healthy
ID23 healthy
ID24 healthy
ID25 healthy
ID26 healthy
ID27 healthy
ID28 healthy
ID29 healthy
ID30 healthy
ID31 healthy
ID32 healthy
ID33 healthy
ID34 healthy
ID35 healthy
ID36 healthy
ID37 healthy
ID38 healthy
ID39 healthy
ID40 healthy
pheatmap(gene_express[sig$gene, ], cluster_rows = FALSE, cluster_cols = T, scale="row", annotation_col = dfh,
annotation_colors=list(status = c(disease = "orange", healthy = "black")),
color=colorRampPalette(c("navy", "white", "red"))(50))# Transpose 'gene_express' to have samples as rows and genes as columns
gene_express_t <- t(gene_express)
# Convert to data frame and add 'status' information
gene_express_df <- as.data.frame(gene_express_t) %>%
mutate(status = dfh$status)2.1 12.6.13 (b) (30 pts)
There are several methods of “dissimilarity” (correlation based distance) which can be explored for creation of distance matrix. We now plot the hierarchical clustering dendrogram using complete, single, ward centroid and average linkage clustering using correlation based distance. single linkage:
library(tidyverse)
library(tidymodels)
library(readr)
library(tswge)
library(ggplot2)
library(yardstick)
library(workflows)
library(parsnip)
library(tidyclust)
library(RcppHungarian)
library(embed)
library(tidytext)
set.seed(838383)
hc_spec_single <- hier_clust(
num_clusters = 2,
linkage_method = "single"
)
hc_fit_single <- hc_spec_single %>%
fit(~ .,
data = gene_express_df
)
hc_fit_single %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit_single$fit %>% plot(main=" Single Linkage
with Correlation - Based Distance ", xlab="", sub ="")average linkage:
set.seed(838383)
hc_spec_average <- hier_clust(
num_clusters = 2,
linkage_method = "average"
)
hc_fit_average <- hc_spec_average %>%
fit(~ .,
data = gene_express_df
)
hc_fit_average %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit_average$fit %>% plot(main=" Average Linkage
with Correlation - Based Distance ", xlab="", sub ="")complete linkage:
set.seed(838383)
hc_spec_complete <- hier_clust(
num_clusters = 2,
linkage_method = "complete"
)
hc_fit_complete <- hc_spec_complete %>%
fit(~ .,
data = gene_express_df
)
hc_fit_complete %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit_complete$fit %>% plot(main=" Complete Linkage
with Correlation - Based Distance ", xlab="", sub ="")centroid linkage:
set.seed(838383)
hc_spec_centroid <- hier_clust(
num_clusters = 2,
linkage_method = "centroid"
)
hc_fit_centroid <- hc_spec_centroid %>%
fit(~ .,
data = gene_express_df
)
hc_fit_centroid %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit_centroid$fit %>% plot(main=" Centroid Linkage
with Correlation - Based Distance ", xlab="", sub ="")ward linkage:
set.seed(838383)
hc_spec_ward <- hier_clust(
num_clusters = 2,
linkage_method = "ward.D"
)
hc_fit_ward<- hc_spec_ward %>%
fit(~ .,
data = gene_express_df
)
hc_fit_ward %>%
summary() Length Class Mode
spec 4 hier_clust list
fit 7 hclust list
elapsed 1 -none- list
preproc 4 -none- list
hc_fit_ward$fit %>% plot(main=" Ward Linkage
with Correlation - Based Distance ", xlab="", sub ="")Inference: Yes, the genes separate the samples into the two groups. The results depend on the type of linkage used, but not too much. As we see, we obtain two clusters for average, complete and single, ward linkages except for centroid linkage. Typically, single linkage will tend to yield trailing clusters: very large clusters onto which individual observations attach one-by-one. On the other hand, complete ward and average linkage tend to yield more balanced, attractive clusters. For this reason, complete, ward and average linkage are generally preferred to single and centroid linkage.
2.2 PCA and UMAP (30 pts)
PCA
library(recipes)
library(tidymodels)
# Setup the recipe for PCA
pca_recipe <- recipe(~ ., data = gene_express_df) %>%
update_role(status, new_role = "ID") %>% # Mark 'status' as an identifier
step_normalize(all_predictors(), -all_outcomes()) %>%
step_pca(all_predictors(), -all_outcomes())
# Prepare the recipe
pca_prep <- prep(pca_recipe)
# Optionally, summarize the prepared recipe
summary(pca_prep)# A tibble: 6 × 4
variable type role source
<chr> <list> <chr> <chr>
1 status <chr [3]> ID original
2 PC1 <chr [2]> predictor derived
3 PC2 <chr [2]> predictor derived
4 PC3 <chr [2]> predictor derived
5 PC4 <chr [2]> predictor derived
6 PC5 <chr [2]> predictor derived
library(tidytext)
tidied_pca <- tidy(pca_prep, 2)
tidied_pca %>%
filter(component %in% paste0("PC", 1:4)) %>%
group_by(component) %>%
top_n(8, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)juice(pca_prep) %>%
ggplot(aes(PC1, PC2, label = status)) +
geom_point(aes(color = status), alpha = 0.7, size = 2) +
#geom_text(check_overlap = TRUE, hjust = "inward") +
labs(x = "Principal Component 1", y = "Principal Component 2", title = "PCA of Gene Expression Data") +
theme_minimal()UMAP
library(embed)
library(Matrix)
library(irlba)
umap_rec <- recipe(~., data = gene_express_df) %>%
update_role(status, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_umap(all_predictors())
umap_prep <- prep(umap_rec)
umap_prepjuice(umap_prep) %>%
ggplot(aes(UMAP1, UMAP2, label = status)) +
geom_point(aes(color = status), alpha = 0.7, size = 2) +
# geom_text(check_overlap = TRUE, hjust = "inward") +
labs(x = "UMAP1", y = "UMAP2", title = "umap")2.3 12.6.13 (c) (30 pts)
grp = factor(rep(c(1, 0), each = 20))
regression <- function(y) {
sum <- summary(lm(y ~ grp))
pv <- sum$coefficients[2, 4]
return(pv)
}
out <- tibble(gene = seq(1, nrow(gene_express)),
p_values = unlist(purrr:: map(1:nrow(gene_express), ~regression(as.matrix(gene_express)[.x, ]))))out %>% arrange(p_values) %>% head(10)# A tibble: 10 × 2
gene p_values
<int> <dbl>
1 502 1.43e-12
2 589 3.19e-12
3 600 9.81e-12
4 590 6.28e-11
5 565 1.74e-10
6 551 1.10e- 9
7 593 1.19e- 9
8 584 1.65e- 9
9 538 2.42e- 9
10 569 2.69e- 9
# sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05/nrow(gene_express))
sig <- out %>% arrange(p_values) %>% filter(p_values < 0.05 )In conclusion, 502, 589, 600,590, 565, 551, 593, 584, 538, 569 are the genes that are significantly different between the two groups (healthy and diseased) as indicated by the adjusted p-values being below 0.05.
# install.packages("pheatmap")
library(pheatmap)
# install.packages("ggplotify")
library(ggplotify) ## to convert pheatmap to ggplot2
# install.packages("heatmaply")
library(heatmaply) ## for constructing interactive heatmap#create data frame for annotations
dfh <- data.frame(sample=as.character(colnames(gene_express)), status = "disease") %>%
column_to_rownames("sample")
dfh$status[seq(21, 40)] <- "healthy"
dfh status
ID1 disease
ID2 disease
ID3 disease
ID4 disease
ID5 disease
ID6 disease
ID7 disease
ID8 disease
ID9 disease
ID10 disease
ID11 disease
ID12 disease
ID13 disease
ID14 disease
ID15 disease
ID16 disease
ID17 disease
ID18 disease
ID19 disease
ID20 disease
ID21 healthy
ID22 healthy
ID23 healthy
ID24 healthy
ID25 healthy
ID26 healthy
ID27 healthy
ID28 healthy
ID29 healthy
ID30 healthy
ID31 healthy
ID32 healthy
ID33 healthy
ID34 healthy
ID35 healthy
ID36 healthy
ID37 healthy
ID38 healthy
ID39 healthy
ID40 healthy
pheatmap(gene_express[sig$gene, ], cluster_rows = FALSE, cluster_cols = T, scale="row", annotation_col = dfh,
annotation_colors=list(status = c(disease = "orange", healthy = "black")),
color=colorRampPalette(c("navy", "white", "red"))(50))